We add cell type information in X and look at W.
As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. I will color-code the cells by either known cell type or by inferred cluster (inferred in the original study).
Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells.
data("allen")
allen_core <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
which(colData(allen)$Core.Type=="Core")]
filter <- apply(assay(allen_core)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 11545
To speed up the computations, I will focus on the top 1,000 most variable genes.
allen_core <- allen_core[filter,]
core <- assay(allen_core)
vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]
First, let’s look at PCA (of the log counts) for reference.
par(mfrow=c(1, 2))
bio <- as.factor(colData(allen_core)$driver_1_s)
cl <- as.factor(colData(allen_core)$Primary.Type)
detection_rate <- colSums(core>0)
coverage <- colSums(core)
pca <- prcomp(t(log1p(core)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, centered not scaled")
legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, centered not scaled")
df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.00000000 -0.01052424 0.8066812 0.4453172
## PC2 -0.01052424 1.00000000 -0.3601716 -0.3286485
## detection_rate 0.80668124 -0.36017158 1.0000000 0.5275845
## coverage 0.44531717 -0.32864852 0.5275845 1.0000000
print(system.time(zinb_X <- zinbFit(core, ncores = 3, K = 2,
X=model.matrix(~bio))))
## user system elapsed
## 138.696 31.569 74.610
Plot the results with cells colored according to their biological condition.
par(mfrow=c(1, 2))
plot(zinb_X@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)
plot(zinb_X@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottom", levels(cl), fill=cols2, horiz=T)
One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.
#total number of detected genes in the cell
df <- data.frame(W1=zinb_X@W[,1], W2=zinb_X@W[,2], gamma_mu = zinb_X@gamma_mu[1,], gamma_pi = zinb_X@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.00000000 0.06243371 -0.14284292 0.03336945
## W2 0.06243371 1.00000000 0.04833728 0.22845102
## gamma_mu -0.14284292 0.04833728 1.00000000 -0.25597360
## gamma_pi 0.03336945 0.22845102 -0.25597360 1.00000000
## detection_rate -0.04056153 -0.26312871 0.29942803 -0.99261966
## coverage 0.04966279 -0.07480962 0.87261478 -0.49206198
## detection_rate coverage
## W1 -0.04056153 0.04966279
## W2 -0.26312871 -0.07480962
## gamma_mu 0.29942803 0.87261478
## gamma_pi -0.99261966 -0.49206198
## detection_rate 1.00000000 0.52758451
## coverage 0.52758451 1.00000000
\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_X)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_X))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(3, 2))
boxplot(zinb_X@beta_mu[1,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[1,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_X@beta_mu[2,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[2,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_X@beta_mu[3,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[3,], main="Beta_Pi")
abline(h=0)
par(mfrow=c(2, 2))
plot(t(zinb_X@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_X@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
print(system.time(zinb_Xdisp <- zinbFit(core, ncores = 3, K = 2,
X=model.matrix(~bio),commondispersion=F)))
## user system elapsed
## 418.942 220.293 247.063
Plot the results with cells colored according to their biological condition.
par(mfrow=c(1, 2))
plot(zinb_Xdisp@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)
plot(zinb_Xdisp@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottom", levels(cl), fill=cols2,horiz = T)
One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.
#total number of detected genes in the cell
df <- data.frame(W1=zinb_Xdisp@W[,1], W2=zinb_Xdisp@W[,2], gamma_mu = zinb_Xdisp@gamma_mu[1,], gamma_pi = zinb_Xdisp@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.00000000 0.15563821 -0.12603586 0.02071153
## W2 0.15563821 1.00000000 0.09032643 0.25316448
## gamma_mu -0.12603586 0.09032643 1.00000000 -0.22512973
## gamma_pi 0.02071153 0.25316448 -0.22512973 1.00000000
## detection_rate -0.02162210 -0.28934921 0.27311190 -0.98961610
## coverage 0.06296143 -0.08869507 0.86093195 -0.48543702
## detection_rate coverage
## W1 -0.0216221 0.06296143
## W2 -0.2893492 -0.08869507
## gamma_mu 0.2731119 0.86093195
## gamma_pi -0.9896161 -0.48543702
## detection_rate 1.0000000 0.52758451
## coverage 0.5275845 1.00000000
\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Xdisp)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_Xdisp))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(3, 2))
boxplot(zinb_Xdisp@beta_mu[1,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_Xdisp@beta_pi[1,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_Xdisp@beta_mu[2,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_Xdisp@beta_pi[2,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_Xdisp@beta_mu[3,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_Xdisp@beta_pi[3,], main="Beta_Pi")
abline(h=0)
par(mfrow=c(1, 2))
plot(t(zinb_Xdisp@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Xdisp@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
print(system.time(zinb_XnoneV <- zinbFit(core, ncores = 3, K = 2,
X=model.matrix(~bio),
V=matrix(0, ncol=1, nrow=NROW(core)))))
## user system elapsed
## 301.229 63.112 147.607
Plot the results with cells colored according to their biological condition.
par(mfrow=c(1, 2))
plot(zinb_XnoneV@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)
plot(zinb_XnoneV@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottom", levels(cl), fill=cols2, horiz=T)
One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.
#total number of detected genes in the cell
df <- data.frame(W1=zinb_XnoneV@W[,1], W2=zinb_XnoneV@W[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 detection_rate coverage
## W1 1.00000000 -0.08867641 -0.02791175 -0.1674994
## W2 -0.08867641 1.00000000 0.85434291 0.7717366
## detection_rate -0.02791175 0.85434291 1.00000000 0.5275845
## coverage -0.16749936 0.77173664 0.52758451 1.0000000
\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_XnoneV)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_XnoneV))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(3, 2))
boxplot(zinb_XnoneV@beta_mu[1,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_XnoneV@beta_pi[1,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_XnoneV@beta_mu[2,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_XnoneV@beta_pi[2,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_XnoneV@beta_mu[3,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_XnoneV@beta_pi[3,], main="Beta_Pi")
abline(h=0)
par(mfrow=c(2, 2))
plot(t(zinb_XnoneV@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_XnoneV@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")